source('../../workflow/resources/annotateVariants.R')
sampleName <- 'Br7'
inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
annotations <- annotate_variants(sampleName, inputFolder)
For each cluster (defined by color), we computed a pairwise distance for each mutation pair that indicates how often the two mutations occur in the same private branch of cells from the cluster:
dist(M1, M2) = 0 (for M1 = M2)
dist(M1,M2) = 1 - (%samples where M1 and M2 are both in the same private branch of a cell from the cluster) (elsewise)
A private branch is defined as the path from a leaf to the node just below the LCA of this leaf to another leaf from the same cluster.
This is a generalization of the earlier method to find the top seperating mutations of pairs of leafs. The generalization was necessary to handle the larger clusters that were broken in more than 2 pieces.
clusterName <- 'lightcoral'
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
mat<-as.matrix(d)
mat[1:4, 1:4]
## chr11_83486152 chr2_151464674 chrX_68513342 chr3_75669440
## chr11_83486152 0.000000 0.951350 0.946925 0.93910
## chr2_151464674 0.951350 0.000000 0.680475 0.64780
## chrX_68513342 0.946925 0.680475 0.000000 0.50795
## chr3_75669440 0.939100 0.647800 0.507950 0.00000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr11_83486152 0.5454545 chr11_83486152
## chr2_151464674 0.5454545 chr2_151464674
## chrX_68513342 0.5454545 chrX_68513342
## chr3_75669440 0.8181818 chr3_75669440
## chr19_8896139 0.6363636 chr19_8896139
## chr6_32521852 0.6363636 chr6_32521852
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
heatmaply(mat)
mat2 <- mat
diag(mat2) <- 1
min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
selected_muts <- which(min_dist<0.95) # select those below 0.5 say
mat3 <- mat[selected_muts, selected_muts]
This is what the distance matrix looks like now:
heatmaply(mat3)
coverage %>% filter(variantName %in% colnames(mat2))
## covScore variantName REF ALT relevant
## 1 0.5454545 chr11_83486152 C T NONE
## 2 0.5454545 chr2_151464674 C T NONE
## 3 0.5454545 chrX_68513342 G A MODERATE
## 4 0.8181818 chr3_75669440 G A NONE
## 5 0.6363636 chr19_8896139 T A NONE
## 6 0.6363636 chr6_32521852 C T NONE
## 7 0.5454545 chr2_89085291 A G NONE
## 8 0.3636364 chr1_248914055 G T NONE
## 9 0.2727273 chr3_49112500 C G MODERATE
## 10 0.4545455 chr14_106324346 G A NONE
## 11 0.4545455 chr12_48773428 G T NONE
## 12 0.4545455 chr22_26494024 G A NONE
## 13 0.4545455 chr9_36951953 G A NONE
## 14 0.5454545 chr9_72694490 A G NONE
## 15 0.3636364 chr20_3615755 C T NONE
## 16 0.6363636 chr1_16583576 C T NONE
## 17 0.4545455 chr21_10543405 C T NONE
## 18 0.5454545 chr1_177032551 C T MODERATE
## 19 0.2727273 chr2_71071933 T G NONE
## 20 0.3636364 chr10_46940776 A G NONE
## 21 0.6363636 chr12_61753708 G A MODERATE
## 22 0.3636364 chr3_52443021 G C MODERATE
## 23 0.5454545 chr17_31231738 C T NONE
## 24 0.6363636 chr12_81405870 G T MODERATE
## 25 0.5454545 chr3_108394131 T C MODERATE
## 26 0.5454545 chr2_169270911 A G NONE
## 27 0.6363636 chr12_11133790 G A MODERATE
## 28 0.4545455 chr12_66226858 T C NONE
## 29 0.4545455 chr6_32127361 T C NONE
## 30 0.5454545 chr19_54788354 T A NONE
## 31 0.7272727 chr10_48966334 C T NONE
## 32 0.8181818 chr3_75669281 C T NONE
## 33 0.5454545 chr1_203165042 T G NONE
## 34 0.6363636 chr20_30399358 A G NONE
## 35 0.5454545 chr12_51507491 T G NONE
## 36 0.5454545 chr10_97901536 G A MODERATE
## 37 0.4545455 chr9_35148405 C T NONE
## 38 0.6363636 chr5_150124261 C T MODERATE
## 39 0.4545455 chr9_33034269 G T HIGH
## 40 0.7272727 chr14_106639293 G T MODERATE
To cluster mutations, we create a dendrogram based on the pairwise distances:
mat <- mat3
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex=0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
No apparent clustering visible.
clusterName <- 'sandybrown'
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
mat<-as.matrix(d)
mat[1:4, 1:4]
## chr11_83486152 chr2_151464674 chrX_68513342 chr3_75669440
## chr11_83486152 0 1.00000 1.000000 1.000000
## chr2_151464674 1 0.00000 1.000000 0.999950
## chrX_68513342 1 1.00000 0.000000 0.999975
## chr3_75669440 1 0.99995 0.999975 0.000000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr11_83486152 0.5454545 chr11_83486152
## chr2_151464674 0.5454545 chr2_151464674
## chrX_68513342 0.5454545 chrX_68513342
## chr3_75669440 0.8181818 chr3_75669440
## chr19_8896139 0.6363636 chr19_8896139
## chr6_32521852 0.6363636 chr6_32521852
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
heatmaply(mat)
coverage %>% filter(variantName %in% colnames(mat2))
## covScore variantName REF ALT relevant
## 1 0.5454545 chr11_83486152 C T NONE
## 2 0.5454545 chr2_151464674 C T NONE
## 3 0.5454545 chrX_68513342 G A MODERATE
## 4 0.8181818 chr3_75669440 G A NONE
## 5 0.6363636 chr19_8896139 T A NONE
## 6 0.6363636 chr6_32521852 C T NONE
## 7 0.5454545 chr2_89085291 A G NONE
## 8 0.3636364 chr1_248914055 G T NONE
## 9 0.2727273 chr3_49112500 C G MODERATE
## 10 0.4545455 chr14_106324346 G A NONE
## 11 0.4545455 chr12_48773428 G T NONE
## 12 0.4545455 chr22_26494024 G A NONE
## 13 0.4545455 chr9_36951953 G A NONE
## 14 0.5454545 chr9_72694490 A G NONE
## 15 0.3636364 chr20_3615755 C T NONE
## 16 0.6363636 chr1_16583576 C T NONE
## 17 0.4545455 chr21_10543405 C T NONE
## 18 0.5454545 chr1_177032551 C T MODERATE
## 19 0.2727273 chr2_71071933 T G NONE
## 20 0.3636364 chr10_46940776 A G NONE
## 21 0.6363636 chr12_61753708 G A MODERATE
## 22 0.3636364 chr3_52443021 G C MODERATE
## 23 0.5454545 chr17_31231738 C T NONE
## 24 0.6363636 chr12_81405870 G T MODERATE
## 25 0.5454545 chr3_108394131 T C MODERATE
## 26 0.5454545 chr2_169270911 A G NONE
## 27 0.6363636 chr12_11133790 G A MODERATE
## 28 0.4545455 chr12_66226858 T C NONE
## 29 0.4545455 chr6_32127361 T C NONE
## 30 0.5454545 chr19_54788354 T A NONE
## 31 0.7272727 chr10_48966334 C T NONE
## 32 0.8181818 chr3_75669281 C T NONE
## 33 0.5454545 chr1_203165042 T G NONE
## 34 0.6363636 chr20_30399358 A G NONE
## 35 0.5454545 chr12_51507491 T G NONE
## 36 0.5454545 chr10_97901536 G A MODERATE
## 37 0.4545455 chr9_35148405 C T NONE
## 38 0.6363636 chr5_150124261 C T MODERATE
## 39 0.4545455 chr9_33034269 G T HIGH
## 40 0.7272727 chr14_106639293 G T MODERATE
To cluster mutations, we create a dendrogram based on the pairwise distances:
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex=0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
There seems to be no signal at all, so I stop here.
clusterName <- 'skyblue3'
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
mat<-as.matrix(d)
mat[1:4, 1:4]
## chr11_83486152 chr2_151464674 chrX_68513342 chr3_75669440
## chr11_83486152 0.00000 0.939350 0.924950 0.930250
## chr2_151464674 0.93935 0.000000 0.468875 0.464350
## chrX_68513342 0.92495 0.468875 0.000000 0.281575
## chr3_75669440 0.93025 0.464350 0.281575 0.000000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr11_83486152 0.5454545 chr11_83486152
## chr2_151464674 0.5454545 chr2_151464674
## chrX_68513342 0.5454545 chrX_68513342
## chr3_75669440 0.8181818 chr3_75669440
## chr19_8896139 0.6363636 chr19_8896139
## chr6_32521852 0.6363636 chr6_32521852
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
heatmaply(mat)
mat2 <- mat
diag(mat2) <- 1
min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
selected_muts <- which(min_dist<0.6) # select those below 0.5 say
mat3 <- mat[selected_muts, selected_muts]
This is what the distance matrix looks like now:
heatmaply(mat3)
coverage %>% filter(variantName %in% colnames(mat2))
## covScore variantName REF ALT relevant
## 1 0.5454545 chr11_83486152 C T NONE
## 2 0.5454545 chr2_151464674 C T NONE
## 3 0.5454545 chrX_68513342 G A MODERATE
## 4 0.8181818 chr3_75669440 G A NONE
## 5 0.6363636 chr19_8896139 T A NONE
## 6 0.6363636 chr6_32521852 C T NONE
## 7 0.5454545 chr2_89085291 A G NONE
## 8 0.3636364 chr1_248914055 G T NONE
## 9 0.2727273 chr3_49112500 C G MODERATE
## 10 0.4545455 chr14_106324346 G A NONE
## 11 0.4545455 chr12_48773428 G T NONE
## 12 0.4545455 chr22_26494024 G A NONE
## 13 0.4545455 chr9_36951953 G A NONE
## 14 0.5454545 chr9_72694490 A G NONE
## 15 0.3636364 chr20_3615755 C T NONE
## 16 0.6363636 chr1_16583576 C T NONE
## 17 0.4545455 chr21_10543405 C T NONE
## 18 0.5454545 chr1_177032551 C T MODERATE
## 19 0.2727273 chr2_71071933 T G NONE
## 20 0.3636364 chr10_46940776 A G NONE
## 21 0.6363636 chr12_61753708 G A MODERATE
## 22 0.3636364 chr3_52443021 G C MODERATE
## 23 0.5454545 chr17_31231738 C T NONE
## 24 0.6363636 chr12_81405870 G T MODERATE
## 25 0.5454545 chr3_108394131 T C MODERATE
## 26 0.5454545 chr2_169270911 A G NONE
## 27 0.6363636 chr12_11133790 G A MODERATE
## 28 0.4545455 chr12_66226858 T C NONE
## 29 0.4545455 chr6_32127361 T C NONE
## 30 0.5454545 chr19_54788354 T A NONE
## 31 0.7272727 chr10_48966334 C T NONE
## 32 0.8181818 chr3_75669281 C T NONE
## 33 0.5454545 chr1_203165042 T G NONE
## 34 0.6363636 chr20_30399358 A G NONE
## 35 0.5454545 chr12_51507491 T G NONE
## 36 0.5454545 chr10_97901536 G A MODERATE
## 37 0.4545455 chr9_35148405 C T NONE
## 38 0.6363636 chr5_150124261 C T MODERATE
## 39 0.4545455 chr9_33034269 G T HIGH
## 40 0.7272727 chr14_106639293 G T MODERATE
To cluster mutations, we create a dendrogram based on the pairwise distances:
mat <- mat3
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex=0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
No apparent clustering visible.